This analyses was conducted in 2020-02, and was updated on 2020-08-01 to use the new results files as input and to create an html report.

Setup

library(speedyseq)
library(tidyverse)
library(here)
library(cowplot)
library(ggridges)
library(ggbeeswarm)
library(ggforce)
theme_set(theme_minimal_grid(12))
# import::from(metacal, mutate_by, close_elts)
# Dev version of metacal for pairwise_ratios function
devtools::load_all("~/research/r-packages/metacal", helpers = FALSE)
my_psmelt <- function(physeq) {
  speedyseq::psmelt(physeq) %>%
    as_tibble %>%
    rename(taxon = OTU, sample = Sample, abundance = Abundance)
}
as_tibble.phyloseq <- function(x) {
  x %>% my_psmelt
}
as_tibble.sample_data <- function(x) {
  x %>% as("data.frame") %>% as_tibble(rownames = "sample")
}
as_tibble.taxonomyTable <- function(x) {
  x %>% as("matrix") %>% as_tibble(rownames = "taxon")
}
as_tibble.XStringSet <- function(x) {
  x %>% as.character %>% enframe("taxon", "sequence")
}

Load phyloseq object,

ps <- readRDS(here("results/a1/a1-phyloseq-silva-v138.Rds")) %>%
  print
#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 367 taxa and 96 samples ]
#> sample_data() Sample Data:       [ 96 samples by 22 sample variables ]
#> tax_table()   Taxonomy Table:    [ 367 taxa by 7 taxonomic ranks ]
#> refseq()      DNAStringSet:      [ 367 reference sequences ]
sample_data(ps) <- sample_data(ps) %>%
  transform(extraction_batch = as.factor(extraction_batch))
pstb <- ps %>% as_tibble
tax <- tax_table(ps) %>% as_tibble
ref <- refseq(ps) %>% as_tibble
sam <- sample_data(ps) %>% as_tibble

Get custom taxonomic assignments based on exact matching to 16S references,

custom_species <- dada2::assignSpecies(
  ref$sequence, 
  here("strain-info", "inoculum-tmob-zymo-16s.fasta"),
  verbose = TRUE
  ) %>%
  as_tibble(rownames = "sequence") %>%
  rename_all(str_to_lower) %>%
  left_join(ref, by = "sequence")
#> 22 out of 367 were assigned to the species level.
custom_species %>% filter(!is.na(genus)) %>% print(n=Inf)
#> # A tibble: 22 x 4
#>    sequence                                           genus          species     taxon 
#>    <chr>                                              <chr>          <chr>       <chr> 
#>  1 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGT… Bacteroides    ovatus      A1_AS…
#>  2 TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGC… Escherichia/S… coli        A1_AS…
#>  3 TACAGAGGTCTCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGTGCGT… Akkermansia    muciniphila A1_AS…
#>  4 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGT… Bacteroides    uniformis   A1_AS…
#>  5 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGT… Bacteroides    thetaiotao… A1_AS…
#>  6 TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGT… Staphylococcus aureus      A1_AS…
#>  7 TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGT… Clostridium    symbiosum   A1_AS…
#>  8 TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGC… Roseburia      intestinal… A1_AS…
#>  9 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGT… Bacteroides    caccae      A1_AS…
#> 10 TACGGAGGGTGCAAGCGTTAATCGGAATCACTGGGCGTAAAGGGCGCGT… Thioflavicocc… mobilis     A1_AS…
#> 11 AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGC… Faecalibacter… prausnitzii A1_AS…
#> 12 TACGTAGGGGGCGAGCGTTATCCGGATTCATTGGGCGTAAAGCGCGCGT… Collinsella    aerofaciens A1_AS…
#> 13 TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGT… Marvinbryantia formatexig… A1_AS…
#> 14 TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGC… Eubacterium    rectale     A1_AS…
#> 15 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGT… Barnesiella    intestinih… A1_AS…
#> 16 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGT… Barnesiella    intestinih… A1_AS…
#> 17 TACGTAGGTGGCAAGCGTTATCCGGATTTATTGGGCGTAAAGAGAGTGC… Lactobacillus  fermentum   A1_AS…
#> 18 TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGC… Bacillus       subtilis    A1_AS…
#> 19 TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGC… Salmonella     enterica    A1_AS…
#> 20 TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGCGCGC… Listeria       monocytoge… A1_AS…
#> 21 TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGC… Enterococcus   faecalis    A1_AS…
#> 22 TACGAAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGT… Pseudomonas    aeruginosa  A1_AS…

Filtering and merging

qplot(taxa_sums(ps)) +
  scale_x_log10()

ps0 <- ps %>%
  filter_taxa2(~sum(.) > 1000)

Find ASVs that seem to come from the same genome,

# correlations
asv_cor <- ps0 %>%
  transform_sample_counts(~ . / sum(.)) %>%
  otu_table %>%
  cor
# genetic distances
dna <- refseq(ps0)
nproc <- 4
aln <- DECIPHER::AlignSeqs(dna, processors = nproc, verbose = FALSE)
d <- DECIPHER::DistanceMatrix(aln, processors = nproc, verbose = FALSE)
# tbl to view distances and correlations
dist_tb <- list(cor = asv_cor, dist = d) %>%
  map(as_tibble, rownames = "taxon1") %>%
  map_dfr(pivot_longer, -taxon1, names_to = "taxon2", .id = "type") %>%
  pivot_wider(names_from = type, values_from = value) %>%
  mutate_at(paste0("taxon", 1:2), factor, ordered = TRUE, 
    levels = paste0("A1_ASV", seq(ntaxa(ps0)))) %>%
  filter(taxon1 < taxon2)
ggplot(dist_tb, aes(dist, cor)) +
  geom_point() +
  scale_x_log10()

dist_tb %>%
  filter(dist < 0.01, cor > 0.95)
#> # A tibble: 5 x 4
#>   taxon1   taxon2     cor    dist
#>   <ord>    <ord>    <dbl>   <dbl>
#> 1 A1_ASV1  A1_ASV8  0.989 0.00791
#> 2 A1_ASV9  A1_ASV17 0.997 0.00395
#> 3 A1_ASV13 A1_ASV26 0.997 0.00395
#> 4 A1_ASV19 A1_ASV20 0.996 0.00395
#> 5 A1_ASV23 A1_ASV29 0.985 0.00395

Merge these pairs by repeated calls to merge_taxa().

to_merge <- dist_tb %>%
  filter(dist < 0.01, cor > 0.95) %>%
  # select(taxon1, taxon2) %>%
  mutate_at(paste0("taxon", 1:2), as.character) %>%
  mutate(pair = map2(taxon1, taxon2, c)) %>%
  pull(pair)
ps1 <- purrr::reduce(
  to_merge, 
  .init = ps0,
  .f = merge_taxa
  )

TODO: Look into the two pairs with small distances but low (negative) correlations.

Abundance-based identification of inoculum taxa

Check identification

tax_table(ps0) %>% as_tibble %>%
  filter(genus == "Bacteroides") %>%
  select(taxon, genus, species)
#> # A tibble: 5 x 3
#>   taxon    genus       species                                            
#>   <chr>    <chr>       <chr>                                              
#> 1 A1_ASV1  Bacteroides fragilis/ovatus                                    
#> 2 A1_ASV4  Bacteroides uniformis                                          
#> 3 A1_ASV5  Bacteroides faecichinchillae/faecis/finegoldii/thetaiotaomicron
#> 4 A1_ASV8  Bacteroides caecimuris/fragilis/ovatus/xylanisolvens           
#> 5 A1_ASV10 Bacteroides caccae

So we can distinguish all the Bacteroides.

top_taxa_sums <- ps1 %>%
  subset_samples(specimen_type == "inoculum") %>% 
  taxa_sums %>%
  sort(decreasing = TRUE) %>%
  head(25)
top_taxa_sums
#>  A1_ASV2  A1_ASV6  A1_ASV9 A1_ASV12  A1_ASV7 A1_ASV13  A1_ASV3  A1_ASV4  A1_ASV5 
#>   759865   708938   243955   232970   222337   221722   181042   137149   126390 
#> A1_ASV15 A1_ASV10 A1_ASV16  A1_ASV1 A1_ASV18 A1_ASV24 A1_ASV11 A1_ASV19 A1_ASV23 
#>   104693    75971    53457    43959    15874    13138      543      207       11 
#> A1_ASV25 A1_ASV22 A1_ASV27 A1_ASV28 A1_ASV14 A1_ASV21 
#>        9        5        2        2        0        0
tax_table(ps1) %>%
  as_tibble %>%
  filter(taxon %in% names(top_taxa_sums)) %>%
  print(n=Inf)
#> # A tibble: 24 x 8
#>    taxon  kingdom  phylum    class    order   family   genus    species                
#>    <chr>  <chr>    <chr>     <chr>    <chr>   <chr>    <chr>    <chr>                  
#>  1 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Bactero… Bactero… <NA>                   
#>  2 A1_AS… Bacteria Proteoba… Gammapr… Entero… Enterob… Escheri… <NA>                   
#>  3 A1_AS… Bacteria Verrucom… Verruco… Verruc… Akkerma… Akkerma… muciniphila            
#>  4 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Bactero… Bactero… uniformis              
#>  5 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Bactero… Bactero… faecichinchillae/faeci…
#>  6 A1_AS… Bacteria Firmicut… Bacilli  Staphy… Staphyl… Staphyl… argenteus/aureus/capit…
#>  7 A1_AS… Bacteria Firmicut… Clostri… Lachno… Lachnos… Lachnoc… <NA>                   
#>  8 A1_AS… Bacteria Firmicut… Clostri… Lachno… Lachnos… Rosebur… <NA>                   
#>  9 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Bactero… Bactero… caccae                 
#> 10 A1_AS… Bacteria Proteoba… Gammapr… Chroma… Chromat… Thiofla… mobilis                
#> 11 A1_AS… Bacteria Firmicut… Clostri… Oscill… Ruminoc… Faecali… cf./prausnitzii        
#> 12 A1_AS… Bacteria Firmicut… Bacilli  Bacill… Bacilla… Bacillus <NA>                   
#> 13 A1_AS… Bacteria Proteoba… Gammapr… Entero… Enterob… Escheri… coli                   
#> 14 A1_AS… Bacteria Actinoba… Corioba… Coriob… Corioba… Collins… aerofaciens            
#> 15 A1_AS… Bacteria Firmicut… Clostri… Lachno… Lachnos… Marvinb… formatexigens          
#> 16 A1_AS… Bacteria Firmicut… Clostri… Lachno… Lachnos… Agathob… <NA>                   
#> 17 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Barnesi… Barnesi… intestinihominis       
#> 18 A1_AS… Bacteria Firmicut… Bacilli  Lactob… Lactoba… Lactoba… casei/curvatus/delbrue…
#> 19 A1_AS… Bacteria Firmicut… Bacilli  Bacill… Bacilla… Bacillus altitudinis/amylolique…
#> 20 A1_AS… Bacteria Proteoba… Gammapr… Entero… Enterob… Salmone… <NA>                   
#> 21 A1_AS… Bacteria Firmicut… Bacilli  Bacill… Bacilla… Bacillus acidicola/aerius/aerop…
#> 22 A1_AS… Bacteria Firmicut… Bacilli  Lactob… Listeri… Listeria innocua/ivanovii/marth…
#> 23 A1_AS… Bacteria Firmicut… Bacilli  Lactob… Enteroc… Enteroc… <NA>                   
#> 24 A1_AS… Bacteria Proteoba… Gammapr… Pseudo… Pseudom… Pseudom… aeruginosa/denitrifica…
inoc_taxa <- names(top_taxa_sums)[top_taxa_sums > 1000]
ps2 <- ps1 %>%
  subset_samples(specimen_type %in% c("inoculum", "fecal")) %>%
  prune_taxa(inoc_taxa, .)

This method might leave out ASVs that are very low abundance in the inoculum or poorly measured, but is ok for now. Note, I left in the Staph contaminant ASV6.

Picking ASVs to look at

ps2 %>%
  as_tibble %>%
  group_by(specimen_id, specimen_type, taxon) %>%
  summarize(across(abundance, sum), .groups = "drop") %>%
  mutate_by(specimen_id, proportion = abundance %>% close_elts) %>%
  mutate(taxon = factor(taxon, inoc_taxa)) %>%
  ggplot(aes(taxon, proportion + 1e-5, color = specimen_type)) +
  geom_quasirandom() +
  scale_y_log10() +
  coord_flip()

Roughly consistent with cross-sample contamination contributing roughly 1% of total reads.

From this figure, the following ASVs look like they are completely absent from the fecal specimens: 6, 12, 13, 24. (ASV6 is Staph).

taxa <- paste0("A1_ASV", c(6, 12, 13, 24))
tax_table(ps1)[taxa, 1:6]
#> Taxonomy Table:     [4 taxa by 6 taxonomic ranks]:
#>          kingdom    phylum       class        order              family             
#> A1_ASV6  "Bacteria" "Firmicutes" "Bacilli"    "Staphylococcales" "Staphylococcaceae"
#> A1_ASV12 "Bacteria" "Firmicutes" "Clostridia" "Oscillospirales"  "Ruminococcaceae"  
#> A1_ASV13 "Bacteria" "Firmicutes" "Bacilli"    "Bacillales"       "Bacillaceae"      
#> A1_ASV24 "Bacteria" "Firmicutes" "Bacilli"    "Bacillales"       "Bacillaceae"      
#>          genus             
#> A1_ASV6  "Staphylococcus"  
#> A1_ASV12 "Faecalibacterium"
#> A1_ASV13 "Bacillus"        
#> A1_ASV24 "Bacillus"
dist_tb %>%
  filter(taxon1 %in% taxa, taxon2 %in% taxa)
#> # A tibble: 6 x 4
#>   taxon1   taxon2     cor   dist
#>   <ord>    <ord>    <dbl>  <dbl>
#> 1 A1_ASV6  A1_ASV12 0.811 0.261 
#> 2 A1_ASV6  A1_ASV13 0.613 0.0395
#> 3 A1_ASV6  A1_ASV24 0.512 0.0711
#> 4 A1_ASV12 A1_ASV13 0.812 0.249 
#> 5 A1_ASV12 A1_ASV24 0.465 0.253 
#> 6 A1_ASV13 A1_ASV24 0.339 0.0593
custom_species %>%
  filter(taxon %in% taxa) %>%
  select(-sequence)
#> # A tibble: 4 x 3
#>   genus            species     taxon   
#>   <chr>            <chr>       <chr>   
#> 1 Staphylococcus   aureus      A1_ASV6 
#> 2 Faecalibacterium prausnitzii A1_ASV12
#> 3 <NA>             <NA>        A1_ASV13
#> 4 <NA>             <NA>        A1_ASV24

I don’t think a Bacillus should be in the inoculum. But it is in the Zymo mock.

Hypothesis: The zymo mock taxa are being read into the inoculum samples.

tb <- ps1 %>%
  prune_samples("A1_zymo_posc", .) %>%
  as_tibble %>%
  arrange(-abundance) %>%
  select(taxon:abundance, rank_names(ps1)) %>%
  print(n=Inf)
#> # A tibble: 24 x 10
#>    taxon  sample  abundance kingdom phylum  class  order  family  genus  species       
#>    <chr>  <chr>       <dbl> <chr>   <chr>   <chr>  <chr>  <chr>   <chr>  <chr>         
#>  1 A1_AS… A1_zym…     18326 Bacter… Firmic… Bacil… Staph… Staphy… Staph… argenteus/aur…
#>  2 A1_AS… A1_zym…     17238 Bacter… Proteo… Gamma… Enter… Entero… Salmo… <NA>          
#>  3 A1_AS… A1_zym…     15574 Bacter… Firmic… Bacil… Lacto… Lactob… Lacto… casei/curvatu…
#>  4 A1_AS… A1_zym…     15526 Bacter… Firmic… Bacil… Bacil… Bacill… Bacil… altitudinis/a…
#>  5 A1_AS… A1_zym…     12778 Bacter… Firmic… Bacil… Lacto… Lister… Liste… innocua/ivano…
#>  6 A1_AS… A1_zym…     11809 Bacter… Proteo… Gamma… Enter… Entero… Esche… <NA>          
#>  7 A1_AS… A1_zym…      9191 Bacter… Firmic… Bacil… Lacto… Entero… Enter… <NA>          
#>  8 A1_AS… A1_zym…      8502 Bacter… Proteo… Gamma… Pseud… Pseudo… Pseud… aeruginosa/de…
#>  9 A1_AS… A1_zym…       166 Bacter… Bacter… Bacte… Bacte… Bacter… Bacte… <NA>          
#> 10 A1_AS… A1_zym…        49 Bacter… Bacter… Bacte… Bacte… Bacter… Bacte… uniformis     
#> 11 A1_AS… A1_zym…        29 Bacter… Bacter… Bacte… Bacte… Bacter… Bacte… faecichinchil…
#> 12 A1_AS… A1_zym…        23 Bacter… Bacter… Bacte… Bacte… Bacter… Bacte… caccae        
#> 13 A1_AS… A1_zym…        17 Bacter… Firmic… Clost… Lachn… Lachno… Lachn… <NA>          
#> 14 A1_AS… A1_zym…        17 Bacter… Firmic… Clost… Lachn… Lachno… Roseb… <NA>          
#> 15 A1_AS… A1_zym…        13 Bacter… Verruc… Verru… Verru… Akkerm… Akker… muciniphila   
#> 16 A1_AS… A1_zym…         1 Bacter… Actino… Corio… Corio… Coriob… Colli… aerofaciens   
#> 17 A1_AS… A1_zym…         0 Bacter… Proteo… Gamma… Chrom… Chroma… Thiof… mobilis       
#> 18 A1_AS… A1_zym…         0 Bacter… Firmic… Clost… Oscil… Rumino… Faeca… cf./prausnitz…
#> 19 A1_AS… A1_zym…         0 Bacter… Firmic… Bacil… Bacil… Bacill… Bacil… <NA>          
#> 20 A1_AS… A1_zym…         0 Bacter… Proteo… Gamma… Enter… Entero… Esche… coli          
#> 21 A1_AS… A1_zym…         0 Bacter… Firmic… Clost… Lachn… Lachno… Marvi… formatexigens 
#> 22 A1_AS… A1_zym…         0 Bacter… Firmic… Clost… Lachn… Lachno… Agath… <NA>          
#> 23 A1_AS… A1_zym…         0 Bacter… Bacter… Bacte… Bacte… Barnes… Barne… intestinihomi…
#> 24 A1_AS… A1_zym…         0 Bacter… Firmic… Bacil… Bacil… Bacill… Bacil… acidicola/aer…
tb %>%
  filter(genus == "Bacillus") %>%
  select(taxon, abundance, genus)
#> # A tibble: 3 x 3
#>   taxon    abundance genus   
#>   <chr>        <dbl> <chr>   
#> 1 A1_ASV22     15526 Bacillus
#> 2 A1_ASV13         0 Bacillus
#> 3 A1_ASV24         0 Bacillus

Nope, since ASV13 and ASV24 are not found in the Zymo sample at all. Could have also guess this as they weren’t identified by the custom species assignment,

Pick focal taxa

These are the taxa that seem like they should actually be in the inoculum by design, and that are found in the fecal samples as well, minus ASV18 since it is generally quite rare

# focal_taxa <- paste0("ASV", c(1, 2, 3, 4, 5, 7, 9, 10, 15, 16, 18))
focal_taxa <- paste0("A1_ASV", c(1, 2, 3, 4, 5, 7, 9, 10, 15, 16))
ps3 <- prune_taxa(focal_taxa, ps2)

Visualize ratios of ratios

ps3.pw <- ps3 %>%
  transform_sample_counts(~ . + 1) %>%
  pairwise_ratios(margin = "taxa", filter = TRUE) %>%
  pairwise_ratios(margin = "samples", filter = FALSE) %>%
  subset_samples(specimen_id.1 == specimen_id.2) %>%
  subset_samples(extraction_protocol.1 == "1") %>%
  subset_samples(extraction_protocol.2 == "2")
pwtb <- ps3.pw %>%
  as_tibble %>%
  rename(pair = taxon, bias = abundance) %>%
  separate(pair, c("taxon.1", "taxon.2"), sep = ":", remove = FALSE)
pwtb %>%
  ggplot(aes(pair, log(bias), color = specimen_type.1)) +
  geom_quasirandom() +
  coord_flip() + 
  facet_grid(. ~ extraction_batch.1)

Look at just the Bacteroides,

bacteroides <- tax_table(ps3) %>% as_tibble %>%
  filter(genus == "Bacteroides") %>%
  pull(taxon)
bacteroides_pairs <- crossing(
  taxon1 = bacteroides, 
  taxon2 = bacteroides
  ) %>%
  unite("pair", taxon1, taxon2, sep = ":") %>%
  pull(pair)
pwtb %>%
  filter(pair %in% bacteroides_pairs) %>%
  ggplot(aes(pair, log(bias), 
      label = specimen_id.1, color = specimen_type.1)) +
  geom_quasirandom() +
  # geom_text(position = position_jitter(height = 0, width = 0.2)) +
  coord_flip() + 
  facet_wrap(~paste("batch", extraction_batch.1), nrow = 1)

Note, there is clustering by specimen_id.

Summary: - Bias between the four Bacteroides taxa is small - ASV1 appears to have a lower differential bias in the inoculum than in the fecal samples except in specimens 4 and 25. Notably, it is much rarer in the inoculum samples.

Look at the remaining pairs,

pwtb %>%
  filter(!(pair %in% bacteroides_pairs)) %>%
  ggplot(aes(pair, log(bias), 
      label = specimen_id.1, color = specimen_type.1)) +
  geom_quasirandom() +
  # geom_text(position = position_jitter(height = 0, width = 0.2)) +
  coord_flip() + 
  facet_wrap(~paste("batch", extraction_batch.1), nrow = 1)

Looks like ASV2 has a different bias in the inoc and fecal samples.

Look at just the E. coli pairs,

pwtb %>%
  # filter(!(pair %in% bacteroides_pairs)) %>%
  filter(str_detect(pair, "ASV2")) %>%
  ggplot(aes(pair, log(bias), 
      label = specimen_id.1, color = specimen_type.1)) +
  geom_quasirandom() +
  # geom_text(position = position_jitter(height = 0, width = 0.2)) +
  coord_flip() + 
  facet_wrap(~paste("batch", extraction_batch.1), nrow = 1) +
  theme(
    panel.spacing = unit(1, "cm")
  )

Look at E. coli versus bacteroides

pwtb %>%
  # filter(!(pair %in% bacteroides_pairs)) %>%
  filter(
    str_detect(pair, "ASV2"), 
    taxon.1 %in% bacteroides | taxon.2 %in% bacteroides
    ) %>%
  ggplot(aes(pair, log(bias), 
      label = specimen_id.1, color = specimen_type.1)) +
  geom_quasirandom(width = 0.2) +
  # geom_text(position = position_jitter(height = 0, width = 0.2)) +
  stat_summary(
    fun.data = mean_se, 
    position = position_nudge(x = 0.3),
    # shape = 124, 
    # fatten = 20
    ) +
  coord_flip() + 
  facet_wrap(~paste("batch", extraction_batch.1), nrow = 1) +
  theme(
    panel.spacing = unit(1, "cm")
  )

Clear pattern where E. coli has the same efficiency as Bacteroides in vivo but not in vitro.

To try to understand the outliers, it might be easier to look at samples rather than pairs of samples. Might be worth looking into whether the outliers could be due to cross-sample contamination.

Compositional PCA

pca <- ps3 %>%
  transform_sample_counts(~clr(. + 1)) %>%
  otu_table %>%
  prcomp
x <- pca$x %>% 
  as_tibble(rownames = "sample") %>%
  left_join(sample_data(ps3) %>% as_tibble, by = "sample")
# percent variance explained by each pc
pca_perc <- (pca$sdev^2) %>% close_elts %>% {round(. * 100, 1)} %>% print
#>  [1] 58.0 22.7  7.9  5.0  3.1  1.6  1.2  0.4  0.2  0.0
ggplot(x, aes(PC1, PC2, 
    shape = extraction_batch, 
    # color = specimen_type)) +
    color = extraction_protocol)) +
  geom_point(size = 3) +
  scale_color_brewer(type = "qual", palette = 2) +
  scale_shape_manual(values = c(3, 6)) +
  geom_mark_ellipse(aes(group = specimen_type, label = specimen_type),
    color = "grey") +
  coord_fixed(ratio = pca_perc[2] / pca_perc[1])

PC1 - specimen type, lesser extent: protocol PC2 - batch + type interaction

Next, try centering each specimen, weighting the two protocols equally, and redoing the PCA, so as to focus in on bias.